library(limma)
library(Glimma)
library(edgeR)
library(dplyr)
library(magrittr)
setwd('~/Desktop/USC/Fall_2020/TRGN_510/510_adenocarcinoma_vs_squamous_cell_carcinoma')
# Reading in count data
count_matrix = read.table(file = 'count_processed.txt',sep = '\t',header = T,stringsAsFactors = F, check.names = F, row.names = 1)
count_matrix = count_matrix[,order(names(count_matrix))]
#Reading in clinical metadata
clinical_metadata = read.table(file = 'clinical_processed.txt', sep = '\t', header = T, as.is = T, check.names = F)
clinical_metadata = clinical_metadata[!duplicated(clinical_metadata[ , c("case_submitter_id")]),]
# Reading in exposure metadata
exposure_metadata = read.csv(file = 'exposure_processed.txt', sep = '\t', header = T, as.is = T, check.names = F)
# Merging metadata
all_metadata = merge(x = clinical_metadata, y = exposure_metadata, by = 'case_submitter_id')
all_metadata$ethnicity <- as.factor(all_metadata$ethnicity)
all_metadata$gender <- as.factor(all_metadata$gender)
all_metadata%<>%
mutate(diagnosis=case_when(
primary_diagnosis %in% c("Adenocarcinoma with mixed subtypes","Adenocarcinoma, NOS","Bronchiolo-alveolar adenocarcinoma, NOS","Papillary adenocarcinoma, NOS") ~ c("Adenocarcinoma"),
primary_diagnosis %in% c("Squamous cell carcinoma, large cell, nonkeratinizing, NOS","Squamous cell carcinoma, NOS") ~ c("Squamous_cell_carcinoma")
))
all_metadata$diagnosis = as.factor(all_metadata$diagnosis)
all_metadata$race <- as.factor(all_metadata$race)
rownames(all_metadata) <- all_metadata$case_submitter_id
all_metadata = all_metadata[,-1]
all_metadata = all_metadata[order(rownames(all_metadata)),]
# Creating DGEList Object
geneExpr = DGEList(counts = count_matrix, samples = all_metadata)
geneExpr$samples$group=all_metadata$diagnosis
geneid = rownames(geneExpr)
# Importing gencode reference from DGC website
gencode = read.table('gencode.gene.info.v22.tsv',sep = '\t',header = T,stringsAsFactors = F, check.names = F)
genes = gencode[geneid %in% gencode$gene_id, ]
genes = genes[,c("gene_id","gene_name","seqname")]
rownames(genes) = genes$gene_id
genes = genes[order(genes$gene_id),]
geneExpr$genes = genes[,-1]
cpm <- cpm(geneExpr)
lcpm <- cpm(geneExpr, log=TRUE)
L <- mean(geneExpr$samples$lib.size) * 1e-6
M <- median(geneExpr$samples$lib.size) * 1e-6
c(L,M)
[1] 52.92581 48.81745
The average library size for this dataset is about 61.1 million.
table(rowSums(geneExpr$counts==0)==75)
FALSE
60483
keep.exprs <- filterByExpr(geneExpr, group=geneExpr$samples$group)
Around 11% of genes in the dataset have zero counts across all 75 samples.
geneExpr <- geneExpr[keep.exprs,, keep.lib.sizes=FALSE]
dim(geneExpr)
[1] 21170 40
In this dataset, the median library size is 61.5 million and 10/61.5 is about 0.2, therefore the filterByExpr function keeps genes that have a CPM of 0.2 or more. With this cutoff the number of genes are reduced to 22200, about 36% of genes from what I started with.
lcpm.cutoff <- log2(10/M + 2/L)
library(RColorBrewer)
nsamples <- ncol(geneExpr)
col <- brewer.pal(nsamples, "Paired")
par(mfrow=c(1,2))
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.55), las=2, main="", xlab="")
title(main="A. Raw data", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcpm[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
legend("topright", colnames(geneExpr$counts), text.col=col, bty="n")
lcpm_filtered <- cpm(geneExpr, log=TRUE)
plot(density(lcpm_filtered[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="B. Filtered data", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcpm_filtered[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
legend("topright", colnames(geneExpr$counts), text.col=col, bty="n")
This graph showed that before filtering the data, a large portion of genes within each samples are lowly-expressed with log-CPM values that are small or negative.
geneExpr = calcNormFactors(geneExpr, method = "TMM")
geneExpr$samples$norm.factors
[1] 1.0391573 0.9244982 0.8856329 1.0521559 1.0145085 1.1196887 1.0727778 1.3600421 1.0171641 0.9386345 0.8890663 0.6747847
[13] 0.9553945 1.1161617 0.9624960 1.1869816 1.0062698 0.8520753 0.8883963 1.0043621 0.8191521 1.0129474 0.9248736 1.1260210
[25] 1.0694870 0.7173413 1.0284701 1.0027376 1.0325592 1.1019957 1.0554783 0.9856847 1.0846584 1.0126931 1.1959792 0.9179716
[37] 1.1264297 1.0857329 1.0489496 1.0082920
geneExpr_unnormalized <- geneExpr
geneExpr_unnormalized$samples$norm.factors <- 1
par(mfrow=c(1,2))
lcpm_unnormalized <- cpm(geneExpr_unnormalized, log=TRUE)
boxplot(lcpm_unnormalized, las=2, col=col, main="")
title(main="A. Example: Unnormalised data",ylab="Log-cpm")
geneExpr_normalized <- calcNormFactors(geneExpr_unnormalized)
geneExpr_normalized$samples$norm.factors
[1] 1.0391573 0.9244982 0.8856329 1.0521559 1.0145085 1.1196887 1.0727778 1.3600421 1.0171641 0.9386345 0.8890663 0.6747847
[13] 0.9553945 1.1161617 0.9624960 1.1869816 1.0062698 0.8520753 0.8883963 1.0043621 0.8191521 1.0129474 0.9248736 1.1260210
[25] 1.0694870 0.7173413 1.0284701 1.0027376 1.0325592 1.1019957 1.0554783 0.9856847 1.0846584 1.0126931 1.1959792 0.9179716
[37] 1.1264297 1.0857329 1.0489496 1.0082920
lcpm_normalized <- cpm(geneExpr_normalized, log=TRUE)
boxplot(lcpm_normalized, las=2, col=col, main="")
title(main="B. Example: Normalised data",ylab="Log-cpm")
This figure showed that after normalization the samples are more similar to each other.
lcpm <- cpm(geneExpr, log=TRUE)
par(mfrow=c(1,2))
col.group <- geneExpr$samples$group
levels(col.group) <- brewer.pal(nlevels(col.group), "Set1")
minimal value for n is 3, returning requested palette with 3 different levels
col.group <- as.character(col.group)
col.race <- geneExpr$samples$race
levels(col.race) <- brewer.pal(nlevels(col.race), "Set2")
col.race <- as.character(col.race)
col.gender <- geneExpr$samples$gender
levels(col.gender) <- brewer.pal(nlevels(col.gender), "Set3")
minimal value for n is 3, returning requested palette with 3 different levels
col.gender <- as.character(col.gender)
col.age <- geneExpr$samples$age_at_diagnosis
levels(col.age) <- brewer.pal(nlevels(col.age), "Set1")
minimal value for n is 3, returning requested palette with 3 different levels
col.age <- as.character(col.age)
col.ethnicity <- geneExpr$samples$ethnicity
levels(col.ethnicity) <- brewer.pal(nlevels(col.ethnicity), "Set2")
minimal value for n is 3, returning requested palette with 3 different levels
col.ethnicity <- as.character(col.ethnicity)
col.cigar_per_day <- geneExpr$samples$cigarettes_per_day
levels(col.cigar_per_day) <- brewer.pal(nlevels(col.cigar_per_day), "Set3")
minimal value for n is 3, returning requested palette with 3 different levels
col.cigar_per_day <- as.character(col.cigar_per_day)
plotMDS(lcpm,labels=geneExpr$samples$group, col=col.group)
title(main="A. Sample groups")
plotMDS(lcpm, labels=geneExpr$samples$gender, col=col.gender, dim=c(3,4))
title(main="B. Sex")
plotMDS(lcpm, labels=geneExpr$samples$age_at_diagnosis, col=col.age, dim=c(3,4))
title(main="C. Age at diagnosis")
plotMDS(lcpm, labels=geneExpr$samples$race, col=col.race, dim=c(3,4))
title(main="D. Race")
plotMDS(lcpm, labels=geneExpr$samples$ethnicity, col=col.ethnicity, dim=c(3,4))
title(main="E. Ethnicity")
plotMDS(lcpm, labels=geneExpr$samples$cigarettes_per_day, col=col.cigar_per_day, dim=c(3,4))
title(main="F. Cigarettes per day")
In the MDS plot, there is no separation between group of patients with early stage lung cancer and late stage lung cancer.
diagnosis = geneExpr$samples$group
ethnicity = geneExpr$samples$ethnicity
sex = geneExpr$samples$gender
race = geneExpr$samples$race
age_at_diagnosis = geneExpr$samples$age_at_diagnosis
smoking_hist = geneExpr$samples$cigarettes_per_day
design = model.matrix(~0+diagnosis+sex)
colnames(design) <- gsub("diagnosis", "", colnames(design))
contr.matrix <- makeContrasts(
AdenocarcinomavsSquamous_cell_carcinoma = Adenocarcinoma-Squamous_cell_carcinoma,
levels = colnames(design))
contr.matrix
Contrasts
Levels AdenocarcinomavsSquamous_cell_carcinoma
Adenocarcinoma 1
Squamous_cell_carcinoma -1
sexmale 0
par(mfrow=c(1,2))
v <- voom(geneExpr, design, plot=TRUE)
v
An object of class "EList"
$genes
21165 more rows ...
$targets
35 more rows ...
$E
TCGA-22-4613 TCGA-22-5489 TCGA-22-5491 TCGA-33-AAS8 TCGA-34-5231 TCGA-43-7656 TCGA-43-8118 TCGA-44-5645
ENSG00000000003.13 6.578393 4.518445 7.050818 5.481583 6.016010 5.138312 5.436772 5.635659
ENSG00000000419.11 5.006541 5.125550 6.009261 5.535389 5.227853 6.273798 5.677880 4.093028
ENSG00000000457.12 3.899691 4.351443 4.164341 3.347290 4.310104 4.022563 3.425331 5.427188
ENSG00000000460.15 3.720341 4.789466 4.070052 3.040155 4.109311 3.920045 3.333184 4.295921
ENSG00000000938.11 4.311517 4.032854 2.267258 2.617683 2.708592 3.026093 5.302423 2.653449
TCGA-44-A47G TCGA-46-3765 TCGA-46-3766 TCGA-49-4486 TCGA-49-4487 TCGA-49-AARO TCGA-50-5942 TCGA-50-5946
ENSG00000000003.13 5.702467 6.113541 6.379925 6.279390 5.972862 6.359283 5.745927 5.480757
ENSG00000000419.11 4.281239 6.092018 5.071231 5.475511 5.916025 4.660764 4.466833 5.072669
ENSG00000000457.12 3.977077 3.866716 3.928589 4.975262 4.130680 3.932304 4.777173 4.218002
ENSG00000000460.15 2.528768 3.742487 2.508825 2.713849 4.464368 2.723976 1.971207 3.939701
ENSG00000000938.11 5.680948 3.611814 4.778872 2.166361 5.063277 5.362237 2.672767 1.804396
TCGA-50-8457 TCGA-50-8460 TCGA-55-7726 TCGA-55-8089 TCGA-56-5897 TCGA-56-A4ZJ TCGA-56-A5DR TCGA-63-A5MH
ENSG00000000003.13 5.061664 5.801764 5.618804 5.648883 4.869300 5.409876 5.176682 5.836250
ENSG00000000419.11 4.197116 4.935787 6.016039 5.237966 5.088500 5.088331 5.362201 5.024777
ENSG00000000457.12 4.293238 3.792598 3.814772 4.220320 3.935572 3.573813 4.258951 4.356796
ENSG00000000460.15 2.450756 1.987285 3.384872 3.679206 3.653911 2.581473 4.020476 4.482014
ENSG00000000938.11 4.286484 4.685892 3.749970 5.450806 4.716595 5.280737 2.910954 2.161463
TCGA-63-A5MY TCGA-64-1676 TCGA-68-8250 TCGA-73-4662 TCGA-77-A5G7 TCGA-85-8048 TCGA-86-7953 TCGA-86-8076
ENSG00000000003.13 5.890766 7.721230 6.362591 6.141768 5.962677 6.170662 5.978361 5.688321
ENSG00000000419.11 6.699551 6.669445 6.051110 4.699274 6.046526 5.186354 4.206129 4.600721
ENSG00000000457.12 4.139408 4.442916 3.510752 5.172006 4.208445 4.329645 3.843559 4.470898
ENSG00000000460.15 4.332709 3.469883 3.347400 4.412238 4.220873 4.220775 4.150741 2.488662
ENSG00000000938.11 2.534827 4.735432 4.758486 4.545843 2.619316 5.029770 4.653009 4.916381
TCGA-90-7766 TCGA-91-6828 TCGA-93-A4JO TCGA-96-7545 TCGA-98-A53C TCGA-99-AA5R TCGA-NJ-A4YQ TCGA-O1-A52J
ENSG00000000003.13 6.386395 5.854894 6.055768 6.086418 4.356866 4.898964 5.288840 5.802251
ENSG00000000419.11 5.411248 4.413630 5.220789 5.534683 4.524946 4.397344 4.731263 5.020307
ENSG00000000457.12 4.452007 4.641835 4.296636 3.867595 3.721552 4.155948 4.791851 4.284247
ENSG00000000460.15 3.854044 2.833918 3.301439 2.842824 2.332691 2.017264 3.217170 3.054791
ENSG00000000938.11 3.879929 4.778828 5.055971 4.323060 5.647853 6.152786 4.741985 5.868203
21165 more rows ...
$weights
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
[1,] 1.9156382 2.265171 2.264350 2.213717 2.251193 2.259541 2.213335 2.134201 2.196382 2.178912 2.103449 2.260762 1.9845015
[2,] 1.7274124 2.263898 2.259424 2.130467 2.259430 2.249206 2.129639 1.709450 1.843064 2.068303 1.956363 2.159220 1.4994892
[3,] 1.1247909 1.889080 1.831902 1.519393 2.058846 1.750188 1.518138 1.492255 1.626831 1.428855 1.303977 1.918998 1.3029032
[4,] 0.9981961 1.764861 1.705691 1.323075 1.957294 1.621630 1.321983 1.007585 1.085743 1.245279 1.141339 1.377043 0.9075725
[5,] 1.2599568 1.638856 1.578728 1.704841 1.839755 1.497199 1.703511 1.700961 1.835126 1.608547 1.469780 1.696029 1.4916402
[,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26]
[1,] 2.199452 2.147720 2.261012 2.239719 2.192143 1.912564 2.265181 2.254155 2.229398 2.217851 2.221969 2.247060 2.0726890
[2,] 1.850504 1.736032 2.236428 1.982529 1.936786 1.422262 2.210388 2.240132 2.157465 2.188616 2.194131 2.229313 1.7227708
[3,] 1.634603 1.518237 2.096214 1.780240 1.608988 1.237064 2.026705 1.700806 1.572091 1.526228 1.539838 1.649835 1.3955026
[4,] 1.090570 1.022096 1.590228 1.189021 1.130158 0.874648 1.494592 1.571770 1.369460 1.405102 1.417681 1.522205 0.9983615
[5,] 1.842526 1.728006 1.913948 1.975738 1.388865 1.414733 1.821489 1.450037 1.757968 1.295786 1.307260 1.403482 1.2056881
[,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36] [,37] [,38] [,39]
[1,] 2.238861 2.258105 2.162523 2.207645 2.263954 2.264046 2.260432 2.0272364 2.252693 2.259175 2.235016 2.216680 2.171008
[2,] 2.217553 2.217851 2.116843 2.174970 2.125441 2.182022 2.229632 1.6608334 2.118335 2.248536 2.170787 1.901291 1.781816
[3,] 1.609251 2.124609 1.389669 1.493012 1.965746 1.964307 1.768653 1.3415540 1.847571 1.745479 1.598156 1.688464 1.563408
[4,] 1.483421 1.549154 1.279861 1.374249 1.347076 1.422712 1.554026 0.9676036 1.311413 1.616765 1.392875 1.125273 1.047906
[5,] 1.367369 2.215406 1.183066 1.267782 2.120177 1.746381 1.945727 1.1615881 1.619536 1.492648 1.784314 1.893762 1.773524
[,40]
[1,] 2.221055
[2,] 1.914839
[3,] 1.703532
[4,] 1.134973
[5,] 1.907243
21165 more rows ...
$design
Adenocarcinoma Squamous_cell_carcinoma sexmale
1 0 1 0
2 0 1 1
3 0 1 1
4 0 1 0
5 0 1 1
35 more rows ...
vfit <- lmFit(v, design)
vfit <- contrasts.fit(vfit, contrasts=contr.matrix)
efit <- eBayes(vfit)
plotSA(efit, main="Final model: Mean-variance trend")
summary(decideTests(efit))
AdenocarcinomavsSquamous_cell_carcinoma
Down 2482
NotSig 16700
Up 1988
res = topTable(efit,sort.by = "P",n=Inf)
head(res)
tfit <- treat(vfit, lfc=1)
dt <- decideTests(tfit)
summary(dt)
AdenocarcinomavsSquamous_cell_carcinoma
Down 320
NotSig 20769
Up 81
plotMD(tfit, column=1, status=dt[,1], main=colnames(tfit)[1],
xlim=c(-8,13))
glMDPlot(tfit, coef=1, status=dt, main=colnames(tfit)[1],
side.main="gene_name", counts=lcpm, groups=geneExpr$samples$group, launch=FALSE)
library(gplots)
par(mar = rep(2, 4))
Adenocarcinoma.vs.SquamousCell.topgenes <- res$gene_name[1:100]
i <- which(v$genes$gene_name %in% Adenocarcinoma.vs.SquamousCell.topgenes)
mycol <- colorpanel(1000,"blue","white","red")
heatmap.2(lcpm[i,], scale="row",
labRow=v$genes$gene_name[i], labCol=v$targets$group,
col=mycol, trace="none", density.info="none",
margin=c(8,12), lhei=c(2,10), dendrogram="column")
load("human_c2_v5p2.rdata")
#idx <- ids2indices(Hs.c6,id=v$genes$gene_name)
library(Homo.sapiens)